# Install if not already installed (remove the comment if not installed)
 
#remotes::install_github('susanathey/causalTree') # Uncomment this to install the causalTree package
#remotes::install_github('grf-labs/sufrep') # Uncomment this to install the sufrep package

# Install packages that are not already installed from this list of libraries
library(causalTree)
library(sufrep)
library(causalTree)
library(dplyr)       # Data manipulation (0.8.0.1)
library(fBasics)     # Summary statistics (3042.89)
library(corrplot)    # Correlations (0.84)
library(psych)       # Correlation p-values (1.8.12)
library(grf)         # Generalized random forests (0.10.2)
library(rpart)       # Classification and regression trees, or CART (4.1-13)
library(rpart.plot)  # Plotting trees (3.0.6)
library(treeClust)   # Predicting leaf position for causal trees (1.1-7)
library(car)         # linear hypothesis testing for causal tree (3.0-2)
library(remotes)    # Install packages from github (2.0.1)
library(readr)       # Reading csv files (1.3.1)
library(tidyr)       # Database operations (0.8.3)
library(tibble)      # Modern alternative to data frames (2.1.1)
library(knitr)       # RMarkdown (1.21)
library(kableExtra)  # Prettier RMarkdown (1.0.1)
library(ggplot2)     # general plotting tool (3.1.0)
library(haven)       # read stata files (2.0.0)
library(aod)         # hypothesis testing (1.3.1)
library(evtree)      # evolutionary learning of globally optimal trees (1.0-7)
library(estimatr)    # simple interface for OLS estimation w/ robust std errors ()
library(glmnet)
library(splines)
library(lmtest)
library(sandwich)
library(ggplot2)
library(openxlsx)

  ############################ EDUCATION RANKFIRST ###############

  # Set the seed for reproducibility
  set.seed(12345)  

  #---------------------------------------#
  # Data Read
  #---------------------------------------#
    
  # Read the data. Here is the dataset for humans vs. control A, corresponding to 
  # those with preferences for education. The 170 observations that could not be 
  # contacted have been removed."
  data <- read_dta("MainData/CausalForestData.dta")
  
  # Drop rows containing missing values
  df <- na.omit(data)
  # Rename variables: 
  df <- df %>% rename(Y=education_rankfirst,W=treatment)
  
  #---------------------------------------#
  # Data Split 
  #---------------------------------------#
  
  #train
  train_fraction <- 1  # Use train_fraction % of the dataset to train our models. 
  
  n <- nrow(df)
  train_idx <- sample.int(n, replace=F, size=floor(n*train_fraction))
  df_train <- df[train_idx,]
  df_test <- df[-train_idx,]
  
  # Diving the data 50%-50% into splitting and estimation
  split_size <- floor(nrow(df_train) * 0.5)
  split_idx <- sample(nrow(df_train), replace=FALSE, size=split_size)
  
  # Make the splits
  df_split <- df_train[split_idx,]
  df_est <- df_train[-split_idx,]
  
  # Isolate the "treatment" as a matrix
  W <- as.matrix(df_train$W)
  
  # Covariates: 
  covariates <- c("sexo","prom_notas", "ptje_ranking", "agno_egreso", "municipal",
                  "subvencionado", "particular", "bajo_linea_pobreza", 
                  "padres_superior", "padres_incompleta", "padres_media_completa", "tecnico", "prom_cm_actual" )
  
  
  # formula
  fmla <- formula(paste0("~ 0 +", paste0(covariates, collapse="+")))
  X <- model.matrix(fmla, df_train)
  
  #From now on, my working dataset is df_train 
  n <- nrow(df_train)
  
  # Isolate the outcome as a matrix
  Y <- as.matrix(df_train$Y)
  
  #---------------------------------------#
  # Causal Forest Estimation 
  #---------------------------------------#
  
  cf <- causal_forest(X,Y,W,W.hat= mean(as.numeric(as.character(df_train$W))),
                      num.trees=n) 
  
  #---------------------------------------#
  # Predict values
  #---------------------------------------#
  
  # Get forest predictions. 
  tau.hat <- predict(cf)$predictions 
  m.hat <- cf$Y.hat  
  e.hat <- cf$W.hat  
  tau.hat <- cf$predictions  # tau(X) estimates
  
  
  #---------------------------------------#
  # CATE Histogram
  #---------------------------------------#
  
  # Set the PDF output
  pdf(file = "Output/Figure2.pdf", width = 7, height = 7)
  
  # Create the histogram
  hist(tau.hat, main = "", xlab = "")
  
  # Close the PDF device
  dev.off()
  
  